# Downloading subtitles
library(dplyr)
library(rvest)
library(utils)
library(downloader)
set.seed(42)

# Loading the dataset
### Available at: https://www.kaggle.com/datasets/ashirwadsangwan/imdb-dataset
# quote = "" switches quote processing off. The file is tab-separated, so a
# double quote inside a title is ordinary text; with read.csv's default quote
# setting an unmatched quote would swallow everything up to the next quote
# character and silently merge rows.
df <- read.csv("title.basics.tsv", header = TRUE, sep = "\t", quote = "",
               encoding = "UTF-8")
# Shuffle the rows (reproducible through the seed set above) so that titles
# are visited in random order further down
df <- sample_n(df, nrow(df))
head(df)
##      tconst titleType                                 primaryTitle
## 1 tt3100548     video              Basta che non si sappia in giro
## 2 tt5453150 tvEpisode                              Episode #1.3160
## 3 tt6716786 tvEpisode                                Episode #1.15
## 4 tt1026957 tvEpisode                              Episode #1.8632
## 5 tt3063346 tvEpisode Outdoor Bar, Fire Pit and Creativity Retreat
## 6 tt2752524 tvEpisode                   Teaching Toward the Future
##                                  originalTitle isAdult startYear endYear
## 1              Basta che non si sappia in giro       1      2009     \\N
## 2                              Episode #1.3160       0      1979     \\N
## 3                                Episode #1.15       0      2015     \\N
## 4                              Episode #1.8632       0      2007     \\N
## 5 Outdoor Bar, Fire Pit and Creativity Retreat       0      2013     \\N
## 6                   Teaching Toward the Future       0      2013     \\N
##   runtimeMinutes         genres
## 1            \\N          Adult
## 2            \\N          Drama
## 3            \\N  Drama,Romance
## 4            \\N  Drama,Romance
## 5             21     Reality-TV
## 6            \\N News,Talk-Show
# Keep only rows that are movies and whose genre field is not the "\\N" marker
is_movie <- df$titleType == "movie"
has_genre <- df$genres != "\\N"
df <- df[is_movie & has_genre, ]

# Primary genre: everything from the first comma onwards is cut off
df[["main_genre"]] <- gsub(",.*", "", df[["genres"]])

# Removing every character of the title that is not a letter, digit or space
df[["primaryTitle"]] <- gsub("[^a-zA-Z0-9 á-űÁ-Ű]", "", df[["primaryTitle"]])
head(df)
##        tconst titleType                    primaryTitle
## 23  tt0258586     movie                La foret qui tue
## 28  tt0299057     movie                      Nayezdniki
## 36  tt2006141     movie                            1959
## 68  tt0400230     movie Biography of Mario Vargas Llosa
## 84  tt0338855     movie                   Chhote Sarkar
## 100 tt0008188     movie             The Little American
##                       originalTitle isAdult startYear endYear
## 23                 La foret qui tue       0      1927     \\N
## 28                       Nayezdniki       0      1987     \\N
## 36                             1959       0      2016     \\N
## 68  Biography of Mario Vargas Llosa       0      2004     \\N
## 84                    Chhote Sarkar       0      1996     \\N
## 100             The Little American       0      1917     \\N
##     runtimeMinutes                  genres  main_genre
## 23             \\N                   Drama       Drama
## 28              50                   Drama       Drama
## 36              88                Thriller    Thriller
## 68              75             Documentary Documentary
## 84             151 Comedy,Romance,Thriller      Comedy
## 100             80       Drama,Romance,War       Drama
##### Downloading subtitles from yifysubtitles.org
# Site root; relative links scraped from the pages are appended to it
root <- "https://yifysubtitles.org"
# Search endpoint; the movie title is appended as the query
url <- paste0(root, "/search?q=")


# Wait until a directory contains at least one entry, or until a timeout.
#
# Arguments:
#   max      - maximum total waiting time in seconds
#   dir      - directory to watch
#   interval - pause between two checks in seconds (must be positive)
# Returns (invisibly) TRUE if the directory is non-empty when the function
# stops waiting, FALSE if the timeout was reached first.
io_wait <- function(max = 2, dir = "./subtitles/temp/", interval = 0.1) {
    stopifnot(is.numeric(interval), length(interval) == 1, interval > 0)
    waited <- 0
    # A loop is required here: a single `if` would sleep at most once and
    # never look at the directory again.
    while (length(list.files(dir)) == 0 && waited < max) {
        Sys.sleep(interval)
        waited <- waited + interval
    }
    invisible(length(list.files(dir)) > 0)
}


# Function to get download site and title
#
# Opens the search result page, follows the first result link to the movie
# page, picks the first subtitle link whose href matches "subtitle.*eng" and
# returns the download link(s) found on that subtitle page.
#
# Arguments:
#   search_url - search URL for one title (already URL-encoded)
# Returns a character vector of absolute download URLs, or NULL when any step
# fails (no search result, no matching subtitle link, no download link, or an
# error while fetching/parsing a page).
# Side effects: assigns the parsed movie page to the global `subtitle_site`
# and the movie title shown on that page to the global `s_title`.
# Reads the global `root` to turn relative links into absolute ones.
get_download_site_and_title <- function(search_url) {
    tryCatch({
        # Use the argument, not a global variable, as the page to fetch
        movie_href <- read_html(search_url) %>%
            html_node(".media-body") %>%
            html_node("a") %>%
            html_attr("href")
        if (is.na(movie_href)) {
            # No search result: avoid requesting a meaningless "<root>NA" URL
            return(NULL)
        }
        subtitle_site <<- read_html(paste0(root, movie_href))

        # Get title from the site
        s_title <<- subtitle_site %>%
            html_node(".movie-main-title") %>%
            html_text()

        subtitle_hrefs <- subtitle_site %>%
            html_nodes(".table.other-subs") %>%
            html_nodes("a") %>%
            html_attr("href")
        english_href <- grep("subtitle.*eng", subtitle_hrefs, value = TRUE)[1]
        if (is.na(english_href)) {
            return(NULL)
        }

        download_hrefs <- read_html(paste0(root, english_href)) %>%
            html_nodes(".download-subtitle") %>%
            html_attr("href")
        download_hrefs <- download_hrefs[!is.na(download_hrefs)]
        if (length(download_hrefs) == 0) {
            # Pasting root onto an empty vector would yield the bare site root
            return(NULL)
        }
        paste0(root, download_hrefs)
    }, error = function(e) {
        NULL
    })
}


# Function to download and unzip the subtitle
#
# Downloads the archive, extracts it into `temp_dir` and moves the first
# extracted file to ./subtitles/<title>.srt, where <title> is the global
# `s_title` with characters that are unsafe in file names replaced by spaces.
#
# Arguments:
#   download_site - download URL; only the first element is used
#   dest          - path the archive is saved to
#   temp_dir      - directory the archive is extracted into
#   quiet         - passed on to the download call to silence progress output
# Returns (invisibly) TRUE on success, FALSE on failure.
# Side effects: on failure the global `skip` is set to TRUE. The archive and
# `temp_dir` are removed on exit in both cases.
download_subtitle <- function(download_site, dest, temp_dir, quiet = TRUE) {
    # Clean up however the function ends, so a failed attempt leaves no
    # archive or half-extracted files behind
    on.exit({
        unlink(dest)
        unlink(temp_dir, recursive = TRUE)
    }, add = TRUE)

    ok <- tryCatch({
        # Arguments are named so none can land in the wrong position;
        # mode = "wb" keeps the zip archive intact on platforms that would
        # otherwise write it in text mode
        download(download_site[1], destfile = dest, quiet = quiet, mode = "wb")
        unzip(dest, exdir = temp_dir)
        io_wait()
        unzipped <- list.files(temp_dir, full.names = TRUE)
        if (length(unzipped) == 0) {
            stop("archive did not contain any file")
        }
        # Replace \ / : * ? " < > | by spaces. Only the title is sanitised,
        # never the directory part of the path.
        safe_title <- chartr("\\/:*?\"<>|", strrep(" ", 9), s_title)
        new_name <- file.path("./subtitles", paste0(safe_title, ".srt"))
        if (!file.rename(unzipped[1], new_name)) {
            stop("could not move the subtitle to ", new_name)
        }
        TRUE
    }, error = function(e) {
        skip <<- TRUE
        FALSE
    })
    invisible(ok)
}


# Make sure every directory in `directories` exists.
# Paths are handled one after another in the given order; a path that already
# exists is left alone, a missing one is created.
create_dirs <- function(directories) {
    for (path in directories) {
        if (dir.exists(path)) {
            next
        }
        dir.create(path)
    }
}
# Output directory for the subtitles and scratch directory for unzipping
dirs <- c("subtitles", "subtitles/temp")
create_dirs(dirs)


# Downloading subtitles
# Stop once this many subtitle files have been collected
n <- 1000
# seq_len() yields an empty sequence for an empty data frame (1:0 would not)
for (i in seq_len(nrow(df))) {

    # Count finished .srt files only: the temp directory and any leftover
    # archive also live in ./subtitles and must not count towards n
    if (length(list.files("./subtitles", pattern = "\\.srt$")) >= n) {
        break
    }

    title <- df$primaryTitle[i]
    # A title can be empty after the special characters were stripped;
    # searching for an empty string would fetch an arbitrary movie
    if (!nzchar(trimws(title))) {
        next
    }
    # print(paste(round(i/nrow(df), 2), title, sep="    "))
    search <- URLencode(paste(url, title, sep = ""))
    # Failure flag; download_subtitle() sets it to TRUE when it fails
    skip <- FALSE

    download_site <- get_download_site_and_title(search)
    if (is.null(download_site)) {
        next
    }

    filedest <- paste("./subtitles/", title, ".zip", sep = "")
    download_subtitle(download_site, filedest, "subtitles/temp")

    if (skip) {
        # Remove whatever a failed attempt left behind
        unlink(filedest)
        unlink("./subtitles/temp", recursive = TRUE)
    }
}

# Basic sentiment analysis
library(srt)
library(sentimentr)
library(ggplot2)
library(RColorBrewer)
library(reshape2)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:sentimentr':
## 
##     highlight
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Get a sample of subtitles
# Only .srt files are listed, so the temp directory or a leftover archive can
# never be sampled. The sample size is capped at the number of files found
# because sample() fails when asked for more items than exist.
subs <- list.files("subtitles", pattern = "\\.srt$", full.names = TRUE)
if (length(subs) == 0) {
    stop("No .srt files found in the 'subtitles' directory", call. = FALSE)
}
subs <- sample(subs, min(1000, length(subs)))


# Reading and converting subtitles to a character vector
subtitle <- read_srt(subs[1])
subtitle <- subtitle$subtitle
# File names look like "<title> (<year>).srt". basename() removes the
# directory part regardless of how list.files() joined directory and file.
title <- gsub("\\.srt|\\(.*", "", basename(subs[1])) %>% stringi::stri_trim()
year <- gsub(".*\\(|\\)\\.srt", "", basename(subs[1]))
text <- paste(subtitle, collapse = " ") %>% gsub("\n", " ", .)


# Score one subtitle text and return the scores as a data frame.
#
# Arguments:
#   text  - the subtitle text
#   title - value stored in the `title` column
#   year  - value stored in the `year` column
# Returns a data frame with the columns `title`, `year`, `avg_sentiment`,
# `element_id`, one column per emotion type and `profanity`.
get_sentiment_row <- function(text, title, year) {
    mean_sentiment <- sentiment_by(text)$ave_sentiment

    # emotion_by() output is in long format (one row per emotion type);
    # spread it so that every emotion type becomes its own column
    emotion_long <- emotion_by(text)
    emotion_wide <- reshape(
        emotion_long[, c("element_id", "emotion_type", "ave_emotion")],
        idvar = "element_id",
        timevar = "emotion_type",
        direction = "wide"
    )
    # reshape() prefixes the new columns with "ave_emotion."; drop the prefix
    names(emotion_wide) <- gsub("ave_emotion\\.", "", names(emotion_wide))

    mean_profanity <- profanity_by(text)$ave_profanity

    data.frame(
        "title" = title,
        "year" = year,
        "avg_sentiment" = mean_sentiment,
        emotion_wide,
        "profanity" = mean_profanity
    )
}

# Building the sentiment data frame: one row per subtitle file

# Deleting path and extension from titles
# basename() removes the directory part regardless of how list.files() joined
# directory and file name
sub_files <- basename(subs)
titles <- gsub("\\.srt|\\(.*", "", sub_files) %>% stringi::stri_trim()
# Getting year of the movie
years <- gsub(".*\\(|\\)\\.srt", "", sub_files)


# The scores are computed only if they have not been saved as a csv before;
# otherwise the saved file is loaded and no subtitle is scored at all
if (file.exists("sentiment_scores.csv")) {
    senti_df <- read.csv("sentiment_scores.csv")
} else {
    n_subs <- length(subs)
    # Rows are collected in a preallocated list and bound together once at
    # the end instead of growing the data frame inside the loop
    scored_rows <- vector("list", n_subs)

    # Creating progress bar (txtProgressBar needs max > min)
    pb <- NULL
    if (n_subs > 0) {
        pb <- txtProgressBar(min = 0, max = n_subs, style = 3, width = 50,
                             char = "=")
    }

    # Every file, including the first one, goes through the same error
    # handling and the same zero-score filter
    for (i in seq_len(n_subs)) {
        setTxtProgressBar(pb, i)

        # Error handling is needed because some subtitle files are empty
        scored <- tryCatch({
            lines <- read_srt(subs[i])$subtitle
            sub_text <- paste(lines, collapse = " ") %>% gsub("\n", " ", .)
            candidate <- get_sentiment_row(sub_text, titles[i], years[i])
            # Keep the row only if its scores (all columns except title,
            # year and element_id) do not sum to zero
            if (isTRUE(rowSums(candidate[, -c(1:2, 4)]) != 0)) {
                candidate
            } else {
                NULL
            }
        }, error = function(e) {
            NULL
        })

        # Assigning NULL with [[<- would delete the list slot, so only
        # successful results are stored
        if (!is.null(scored)) {
            scored_rows[[i]] <- scored
        }
    }
    if (!is.null(pb)) {
        close(pb)
    }

    # NULL entries (skipped files) are dropped by bind_rows()
    senti_df <- dplyr::bind_rows(scored_rows)
    write.csv(senti_df, "sentiment_scores.csv", row.names = FALSE)
}
# Changing data type of year column
# Values that are not whole numbers (e.g. the "\\N" marker) become NA
senti_df$year <- as.integer(senti_df$year)
df$startYear <- as.integer(df$startYear)
head(senti_df)
##                    title year avg_sentiment element_id      anger
## 1          Harlem Nights 1989   0.014638470          1 0.02153982
## 2          Shallow Grave 1987   0.006460613          1 0.01657917
## 3             Love Songs 2007   0.005476916          1 0.01570211
## 4    The Future Is Woman 1984   0.091300115          1 0.01151742
## 5 Ambush at Tomahawk Gap 1953   0.015940097          1 0.01758555
## 6             Night Walk 2019   0.062349143          1 0.01263872
##   anger_negated anticipation anticipation_negated     disgust
## 1   0.002651055   0.02087706          0.001325527 0.013697117
## 2   0.002348715   0.02238187          0.002072396 0.012987013
## 3   0.002093614   0.02497383          0.003589053 0.013907582
## 4   0.001727613   0.02533832          0.002879355 0.008350130
## 5   0.005703422   0.01568441          0.004039924 0.005703422
## 6   0.002055076   0.02168105          0.002671599 0.007912043
##   disgust_negated       fear fear_negated         joy  joy_negated
## 1     0.002540594 0.02098752  0.001767370 0.019661991 0.0009941456
## 2     0.001934236 0.02417795  0.002763194 0.026802984 0.0019342360
## 3     0.001196351 0.01869299  0.001345895 0.025123374 0.0035890534
## 4     0.001439678 0.01526058  0.002015549 0.030521163 0.0017276130
## 5     0.001188213 0.01972433  0.004277567 0.008792776 0.0045152091
## 6     0.001027538 0.01798192  0.002774353 0.018392931 0.0006165228
##      sadness sadness_negated    surprise surprise_negated      trust
## 1 0.01866784     0.001215067 0.013476196     0.0009941456 0.01922015
## 2 0.02998066     0.003177673 0.010223819     0.0011052777 0.02763194
## 3 0.01914162     0.003589053 0.016001196     0.0013458950 0.02153432
## 4 0.01727613     0.003455226 0.015836453     0.0011517420 0.02533832
## 5 0.01473384     0.004039924 0.009743346     0.0054657795 0.01972433
## 6 0.01222770     0.002671599 0.009145088     0.0007192766 0.03185368
##   trust_negated    profanity
## 1   0.001104606 0.0185573843
## 2   0.003039514 0.0044211108
## 3   0.002990878 0.0040376851
## 4   0.001727613 0.0020155485
## 5   0.006653992 0.0002376426
## 6   0.001541307 0.0083230580
# Attach the ratings to the movie table (natural join on the shared column)
ratings <- read.delim("title.ratings.tsv")
df <- df %>% inner_join(ratings)
## Joining, by = "tconst"
# Movie columns carried over into the sentiment table
cols <- c("tconst", "primaryTitle", "startYear", "main_genre", "averageRating")


# Match every scored subtitle to its movie by title and release year
senti <- senti_df %>%
    inner_join(df[, cols], by = c("title" = "primaryTitle", "year" = "startYear"))

# Optional rescaling of the score columns by 100 (currently disabled)
# senti[, names(senti)[5:21]] <- senti[, names(senti)[5:21]]*100
# Write the joined table to disk and read it back in
write.csv(senti, "sentiment_db.csv", row.names = FALSE)
senti <- read.csv("sentiment_db.csv")
names(senti)
##  [1] "title"                "year"                 "avg_sentiment"       
##  [4] "element_id"           "anger"                "anger_negated"       
##  [7] "anticipation"         "anticipation_negated" "disgust"             
## [10] "disgust_negated"      "fear"                 "fear_negated"        
## [13] "joy"                  "joy_negated"          "sadness"             
## [16] "sadness_negated"      "surprise"             "surprise_negated"    
## [19] "trust"                "trust_negated"        "profanity"           
## [22] "tconst"               "main_genre"           "averageRating"
# Keep only genres that occur more than 25 times
genre_counts <- table(senti$main_genre)
genres <- names(genre_counts)[genre_counts > 25]
senti <- senti[senti$main_genre %in% genres, ]

# Box plot of the average sentiment per genre; the jittered points are sized
# by profanity and coloured by surprise
ggplot(
    senti[senti$avg_sentiment < 0.5, ],
    aes(x = avg_sentiment, y = main_genre, fill = main_genre)
) +
    geom_boxplot(alpha = 1, outlier.shape = NA) +
    geom_jitter(aes(size = profanity, colour = surprise), alpha = 0.2) +
    scale_colour_gradient(low = "white", high = "red") +
    coord_flip() +
    guides(fill = "none") +
    labs(x = "Sentiment", y = "Genre") +
    theme(
        axis.title = element_text(size = 20),
        axis.text = element_text(size = 10),
        legend.text = element_text(size = 10)
    )

# Yearly mean of each emotion score and of profanity
ts <- aggregate(
    cbind(anger, anticipation, disgust, fear,
          joy, sadness, surprise, trust, profanity) ~ year,
    data = senti,
    FUN = mean
)

# Long format: one row per (year, score) pair
ts <- melt(ts, id.vars = "year")

# Interactive line chart, one line per score, for the years after 1930
tplot <- ggplot(
    ts[ts$year > 1930, ],
    aes(x = year, y = value, colour = variable)
) +
    geom_line()
ggplotly(tplot)